kreuzberg 2.1.2__py3-none-any.whl → 3.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +16 -2
- kreuzberg/_chunker.py +51 -0
- kreuzberg/_constants.py +2 -3
- kreuzberg/_extractors/__init__.py +0 -0
- kreuzberg/_extractors/_base.py +92 -0
- kreuzberg/_extractors/_html.py +34 -0
- kreuzberg/_extractors/_image.py +74 -0
- kreuzberg/_extractors/_pandoc.py +613 -0
- kreuzberg/_extractors/_pdf.py +163 -0
- kreuzberg/_extractors/_presentation.py +233 -0
- kreuzberg/_extractors/_spread_sheet.py +125 -0
- kreuzberg/_mime_types.py +19 -26
- kreuzberg/_ocr/__init__.py +17 -0
- kreuzberg/_ocr/_base.py +54 -0
- kreuzberg/_ocr/_easyocr.py +376 -0
- kreuzberg/_ocr/_paddleocr.py +291 -0
- kreuzberg/_ocr/_tesseract.py +342 -0
- kreuzberg/_playa.py +276 -0
- kreuzberg/_registry.py +108 -0
- kreuzberg/_types.py +133 -36
- kreuzberg/_utils/__init__.py +0 -0
- kreuzberg/{_string.py → _utils/_string.py} +0 -2
- kreuzberg/_utils/_sync.py +121 -0
- kreuzberg/{_tmp.py → _utils/_tmp.py} +1 -1
- kreuzberg/exceptions.py +25 -0
- kreuzberg/extraction.py +114 -227
- kreuzberg-3.0.1.dist-info/METADATA +178 -0
- kreuzberg-3.0.1.dist-info/RECORD +32 -0
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/WHEEL +1 -1
- kreuzberg/_html.py +0 -31
- kreuzberg/_pandoc.py +0 -366
- kreuzberg/_pdf.py +0 -190
- kreuzberg/_pptx.py +0 -88
- kreuzberg/_sync.py +0 -74
- kreuzberg/_tesseract.py +0 -231
- kreuzberg/_xlsx.py +0 -88
- kreuzberg-2.1.2.dist-info/METADATA +0 -446
- kreuzberg-2.1.2.dist-info/RECORD +0 -21
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info/licenses}/LICENSE +0 -0
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/top_level.txt +0 -0
kreuzberg/__init__.py
CHANGED
@@ -1,5 +1,10 @@
|
|
1
|
-
from .
|
2
|
-
from .
|
1
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
2
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
3
|
+
from kreuzberg._ocr._tesseract import TesseractConfig
|
4
|
+
|
5
|
+
from ._ocr._tesseract import PSMMode
|
6
|
+
from ._registry import ExtractorRegistry
|
7
|
+
from ._types import ExtractionConfig, ExtractionResult, Metadata
|
3
8
|
from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
|
4
9
|
from .extraction import (
|
5
10
|
batch_extract_bytes,
|
@@ -7,22 +12,31 @@ from .extraction import (
|
|
7
12
|
batch_extract_file,
|
8
13
|
batch_extract_file_sync,
|
9
14
|
extract_bytes,
|
15
|
+
extract_bytes_sync,
|
10
16
|
extract_file,
|
17
|
+
extract_file_sync,
|
11
18
|
)
|
12
19
|
|
13
20
|
__all__ = [
|
21
|
+
"EasyOCRConfig",
|
22
|
+
"ExtractionConfig",
|
14
23
|
"ExtractionResult",
|
24
|
+
"ExtractorRegistry",
|
15
25
|
"KreuzbergError",
|
16
26
|
"Metadata",
|
17
27
|
"MissingDependencyError",
|
18
28
|
"OCRError",
|
19
29
|
"PSMMode",
|
30
|
+
"PaddleOCRConfig",
|
20
31
|
"ParsingError",
|
32
|
+
"TesseractConfig",
|
21
33
|
"ValidationError",
|
22
34
|
"batch_extract_bytes",
|
23
35
|
"batch_extract_bytes_sync",
|
24
36
|
"batch_extract_file",
|
25
37
|
"batch_extract_file_sync",
|
26
38
|
"extract_bytes",
|
39
|
+
"extract_bytes_sync",
|
27
40
|
"extract_file",
|
41
|
+
"extract_file_sync",
|
28
42
|
]
|
kreuzberg/_chunker.py
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING
|
4
|
+
|
5
|
+
from kreuzberg import MissingDependencyError
|
6
|
+
from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
|
7
|
+
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
8
|
+
|
9
|
+
if TYPE_CHECKING:
|
10
|
+
from semantic_text_splitter import MarkdownSplitter, TextSplitter
|
11
|
+
|
12
|
+
_chunkers: dict[tuple[int, int, str], MarkdownSplitter | TextSplitter] = {}
|
13
|
+
|
14
|
+
|
15
|
+
def get_chunker(
    mime_type: str,
    max_characters: int = DEFAULT_MAX_CHARACTERS,
    overlap_characters: int = DEFAULT_MAX_OVERLAP,
) -> MarkdownSplitter | TextSplitter:
    """Return a text splitter for the given MIME type and size limits.

    Splitters are cached in the module-level ``_chunkers`` dict, keyed by
    ``(max_characters, overlap_characters, mime_type)``, so repeated calls
    with the same arguments return the same instance.

    Args:
        mime_type: The mime type of the content. Markdown content gets a
            ``MarkdownSplitter``; every other type gets a ``TextSplitter``.
        max_characters: Maximum number of characters allowed in each chunk.
        overlap_characters: Number of characters overlapping between two consecutive chunks.

    Raises:
        MissingDependencyError: if semantic-text-splitter is not installed.

    Returns:
        The splitter configured with the specified maximum characters and overlap.
    """
    cache_key = (max_characters, overlap_characters, mime_type)

    # Guard clause: reuse a previously built splitter when one exists.
    cached = _chunkers.get(cache_key)
    if cached is not None:
        return cached

    try:
        # The optional dependency is imported lazily so that the module can be
        # imported without semantic-text-splitter installed.
        if mime_type == MARKDOWN_MIME_TYPE:
            from semantic_text_splitter import MarkdownSplitter

            splitter = MarkdownSplitter(max_characters, overlap_characters)
        else:
            from semantic_text_splitter import TextSplitter

            splitter = TextSplitter(max_characters, overlap_characters)
    except ImportError as e:
        raise MissingDependencyError.create_for_package(
            dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
        ) from e

    _chunkers[cache_key] = splitter
    return splitter
|
kreuzberg/_constants.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from multiprocessing import cpu_count
|
4
3
|
from typing import Final
|
5
4
|
|
6
|
-
DEFAULT_MAX_PROCESSES: Final[int] = cpu_count()
|
7
|
-
MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
|
8
5
|
MINIMAL_SUPPORTED_PANDOC_VERSION: Final[int] = 2
|
6
|
+
DEFAULT_MAX_CHARACTERS: Final[int] = 2000
|
7
|
+
DEFAULT_MAX_OVERLAP: Final[int] = 100
|
File without changes
|
@@ -0,0 +1,92 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from abc import ABC, abstractmethod
|
4
|
+
from typing import TYPE_CHECKING, ClassVar
|
5
|
+
|
6
|
+
if TYPE_CHECKING:
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
from kreuzberg import ExtractionResult
|
10
|
+
from kreuzberg._types import ExtractionConfig
|
11
|
+
|
12
|
+
|
13
|
+
class Extractor(ABC):
    """Abstract base class for file content extraction.

    This class provides the interface for different types of content extractors.
    Subclasses are expected to implement the methods for extracting content
    both asynchronously and synchronously, from bytes and from a path, and to
    declare the MIME types they support.

    Attributes:
        SUPPORTED_MIME_TYPES: The set of supported mime types - all non-abstract extractors must define this.

    Args:
        mime_type: The MIME type that this extractor handles (e.g., "application/pdf").
        config: Configuration options for the extraction process.
    """

    __slots__ = ("config", "mime_type")

    SUPPORTED_MIME_TYPES: ClassVar[set[str]]

    def __init__(self, mime_type: str, config: ExtractionConfig) -> None:
        self.mime_type = mime_type
        self.config = config

    @abstractmethod
    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
        """Asynchronously extract content from a byte stream.

        Args:
            content: The byte content to extract.

        Returns:
            ExtractionResult: The extracted content along with metadata about the extraction.
        """

    @abstractmethod
    async def extract_path_async(self, path: Path) -> ExtractionResult:
        """Asynchronously extract content from a file located at the specified path.

        Args:
            path: The path to the file to process.

        Returns:
            ExtractionResult: The extracted content along with metadata about the extraction.
        """

    @abstractmethod
    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Synchronously extract content from a byte stream.

        Args:
            content: The byte content to extract.

        Returns:
            ExtractionResult: The extracted content along with metadata about the extraction.
        """

    @abstractmethod
    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Synchronously extract content from a file located at the specified path.

        Args:
            path: The path to the file to process.

        Returns:
            ExtractionResult: The extracted content along with metadata about the extraction.
        """

    @classmethod
    def supports_mimetype(cls, mime_type: str) -> bool:
        """Verify whether the extractor supports the given MIME type.

        A MIME type is supported when it equals, or starts with, one of the
        entries in ``SUPPORTED_MIME_TYPES``; a value carrying parameters such
        as ``"text/html; charset=utf-8"`` therefore matches ``"text/html"``.

        Args:
            mime_type: The MIME type to check (e.g., "application/pdf").

        Returns:
            bool: True if the MIME type is supported, False otherwise.
        """
        # str.startswith accepts a tuple of prefixes, and an exact match is
        # also a prefix match, so a single call covers both cases.
        return mime_type.startswith(tuple(cls.SUPPORTED_MIME_TYPES))
|
@@ -0,0 +1,34 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING, ClassVar
|
4
|
+
|
5
|
+
import html_to_markdown
|
6
|
+
from anyio import Path as AsyncPath
|
7
|
+
|
8
|
+
from kreuzberg._extractors._base import Extractor
|
9
|
+
from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
|
10
|
+
from kreuzberg._types import ExtractionResult
|
11
|
+
from kreuzberg._utils._string import normalize_spaces, safe_decode
|
12
|
+
from kreuzberg._utils._sync import run_sync
|
13
|
+
|
14
|
+
if TYPE_CHECKING:
|
15
|
+
from pathlib import Path
|
16
|
+
|
17
|
+
|
18
|
+
class HTMLExtractor(Extractor):
    """Extractor that converts HTML content to Markdown text."""

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {HTML_MIME_TYPE}

    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
        """Convert HTML bytes to Markdown by delegating to ``extract_bytes_sync`` via ``run_sync``.

        Args:
            content: The raw HTML bytes.

        Returns:
            The extraction result holding the Markdown text.
        """
        return await run_sync(self.extract_bytes_sync, content)

    async def extract_path_async(self, path: Path) -> ExtractionResult:
        """Read the file asynchronously, then convert its bytes to Markdown.

        Args:
            path: The path of the HTML file to process.

        Returns:
            The extraction result holding the Markdown text.
        """
        raw = await AsyncPath(path).read_bytes()
        return await run_sync(self.extract_bytes_sync, raw)

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Decode the HTML bytes, convert them to Markdown and normalize whitespace.

        Args:
            content: The raw HTML bytes.

        Returns:
            The extraction result with the Markdown MIME type, empty metadata
            and no chunks.
        """
        html = safe_decode(content)
        markdown = html_to_markdown.convert_to_markdown(html)
        return ExtractionResult(
            content=normalize_spaces(markdown),
            mime_type=MARKDOWN_MIME_TYPE,
            metadata={},
            chunks=[],
        )

    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Read the file synchronously and convert its bytes to Markdown.

        Args:
            path: The path of the HTML file to process.

        Returns:
            The extraction result holding the Markdown text.
        """
        return self.extract_bytes_sync(path.read_bytes())
|
@@ -0,0 +1,74 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING, ClassVar
|
4
|
+
|
5
|
+
import anyio
|
6
|
+
from anyio import Path as AsyncPath
|
7
|
+
|
8
|
+
from kreuzberg._extractors._base import Extractor
|
9
|
+
from kreuzberg._mime_types import IMAGE_MIME_TYPES
|
10
|
+
from kreuzberg._ocr import get_ocr_backend
|
11
|
+
from kreuzberg._utils._tmp import create_temp_file
|
12
|
+
from kreuzberg.exceptions import ValidationError
|
13
|
+
|
14
|
+
if TYPE_CHECKING: # pragma: no cover
|
15
|
+
from collections.abc import Mapping
|
16
|
+
from pathlib import Path
|
17
|
+
|
18
|
+
from kreuzberg._types import ExtractionResult
|
19
|
+
|
20
|
+
|
21
|
+
class ImageExtractor(Extractor):
    """Extractor that obtains text from images through the configured OCR backend."""

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES

    # Maps image MIME types to the file extension used for the temporary file
    # that byte content is written to before OCR.
    IMAGE_MIME_TYPE_EXT_MAP: ClassVar[Mapping[str, str]] = {
        "image/bmp": "bmp",
        "image/x-bmp": "bmp",
        "image/x-ms-bmp": "bmp",
        "image/gif": "gif",
        "image/jpeg": "jpg",
        "image/pjpeg": "jpg",
        "image/png": "png",
        "image/tiff": "tiff",
        "image/x-tiff": "tiff",
        "image/jp2": "jp2",
        "image/jpx": "jpx",
        "image/jpm": "jpm",
        "image/mj2": "mj2",
        "image/webp": "webp",
        "image/x-portable-anymap": "pnm",
        "image/x-portable-bitmap": "pbm",
        "image/x-portable-graymap": "pgm",
        "image/x-portable-pixmap": "ppm",
    }

    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
        """Write the image bytes to a temporary file and run OCR on that file.

        Args:
            content: The raw image bytes.

        Raises:
            ValidationError: If the extractor's MIME type has no known file
                extension, or if no OCR backend is configured.

        Returns:
            The result produced by the OCR backend.
        """
        extension = self._get_extension_from_mime_type(self.mime_type)
        file_path, unlink = await create_temp_file(f".{extension}")
        try:
            # The write happens inside the try block so that the temporary
            # file is removed even when writing the content fails.
            await AsyncPath(file_path).write_bytes(content)
            return await self.extract_path_async(file_path)
        finally:
            await unlink()

    async def extract_path_async(self, path: Path) -> ExtractionResult:
        """Run the configured OCR backend on the image at ``path``.

        Args:
            path: The path of the image file to process.

        Raises:
            ValidationError: If ``config.ocr_backend`` is None.

        Returns:
            The result produced by the OCR backend.
        """
        if self.config.ocr_backend is None:
            raise ValidationError("ocr_backend is None, cannot perform OCR")

        backend = get_ocr_backend(self.config.ocr_backend)
        return await backend.process_file(path, **self.config.get_config_dict())

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Synchronous wrapper that runs ``extract_bytes_async`` with ``anyio.run``.

        Args:
            content: The raw image bytes.

        Returns:
            The result produced by the OCR backend.
        """
        return anyio.run(self.extract_bytes_async, content)

    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Synchronous wrapper that runs ``extract_path_async`` with ``anyio.run``.

        Args:
            path: The path of the image file to process.

        Returns:
            The result produced by the OCR backend.
        """
        return anyio.run(self.extract_path_async, path)

    def _get_extension_from_mime_type(self, mime_type: str) -> str:
        """Return the file extension (without a leading dot) for an image MIME type.

        Lookup order:
            1. Exact match in ``IMAGE_MIME_TYPE_EXT_MAP``.
            2. A known type that is a prefix of ``mime_type`` - the same rule
               ``Extractor.supports_mimetype`` applies, so a value carrying
               parameters (e.g. ``"image/png; charset=binary"``) resolves.
            3. A known type that ``mime_type`` is a prefix of - the previous
               behavior, kept as a last resort for backward compatibility.

        Args:
            mime_type: The MIME type to resolve.

        Raises:
            ValidationError: If no entry matches.

        Returns:
            The matching file extension.
        """
        exact = self.IMAGE_MIME_TYPE_EXT_MAP.get(mime_type)
        if exact is not None:
            return exact

        for known_type, extension in self.IMAGE_MIME_TYPE_EXT_MAP.items():
            if mime_type.startswith(known_type):
                return extension

        # Legacy fallback: accept a truncated MIME type that prefixes a known one.
        for known_type, extension in self.IMAGE_MIME_TYPE_EXT_MAP.items():
            if known_type.startswith(mime_type):
                return extension

        raise ValidationError("unsupported mimetype", context={"mime_type": mime_type})
|